PC Open 107

home *** CD-ROM | disk | FTP | other *** search

/ PC Open 107 / PC Open 107 CD 1.bin / CD1 / INTERNET / COPIA SITI / Getleft / getleft-setup-notcl.exe / {app} / scripts / HtmlParser.tcl < prev next >

Wrap

Text File | 2004-03-09 | 22.7 KB | 708 lines

############################################################################### ############################################################################### # HtmlParser.tcl ############################################################################### ############################################################################### # In this file are implemented the procedures used to parse Html file links. ############################################################################### ############################################################################### # Copyright 2000-2004 AndrΘs Garcφa Garcφa -- fandom@retemail.es # Distributed under the terms of the GPL v2 ############################################################################### ############################################################################### namespace eval HtmlParser { ############################################################################### # SetEntities # Initializes the arrays with the translation for Html entities, something # like 'entity(lt)==>' ############################################################################### proc SetEntities {} { variable entities set entities(quot) \" set entities(amp) \\& set entities(lt) < set entities(gt) > set entities(nbsp) {} set entities(iexcl) í set entities(cent) ó set entities(pound) ú set entities(curren) ñ set entities(yen) Ñ set entities(brvbar) \| set entities(sect) º set entities(uml) ¿ set entities(copy) ⌐ set entities(ordf) ¬ set entities(laquo) ½ set entities(not) ¼ set entities(shy) ¡ set entities(reg) « set entities(macr) » set entities(deg) ░ set entities(plusmn) ▒ set entities(sup2) ▓ set entities(sup3) │ set entities(acute) ┤ set entities(micro) ╡ set entities(para) ╢ set entities(middot) ╖ set entities(cedil) ╕ set entities(sup1) ╣ set entities(ordm) ║ set entities(raquo) ╗ set entities(frac14) ╝ set entities(frac12) ╜ set entities(frac34) ╛ set entities(iquest) ┐ set entities(ntilde) ± set entities(Agrave) └ set entities(Aacute) ┴ set entities(Acirc) ┬ set entities(Atilde) ├ set entities(Auml) ─ set entities(Aring) ┼ set entities(AElig) ╞ set entities(Ccedil) ╟ set entities(Egrave) ╚ set entities(Eacute) ╔ set entities(Ecirc) ╩ set entities(Euml) ╦ set entities(Igrave) ╠ set entities(Iacute) ═ set entities(Icirc) ╬ set entities(Iuml) ╧ set entities(ETH) ╨ set entities(Ntilde) ╤ set entities(Ograve) ╥ set entities(Oacute) ╙ set entities(Ocirc) ╘ set entities(Otilde) ╒ set entities(Ouml) ╓ set entities(times) ╫ set entities(Oslash) ╪ set entities(Ugrave) ┘ set entities(Uacute) ┌ set entities(Ucirc) █ set entities(Uuml) ▄ set entities(Yacute) ▌ set entities(THORN) ▐ set entities(szlig) ▀ set entities(agrave) α set entities(aacute) ß set entities(acirc) Γ set entities(atilde) π set entities(auml) Σ set entities(aring) σ set entities(aelig) µ set entities(ccedil) τ set entities(egrave) Φ set entities(eacute) Θ set entities(ecirc) Ω set entities(euml) δ set entities(igrave) ∞ set entities(iacute) φ set entities(icirc) ε set entities(iuml) ∩ set entities(eth) ≡ set entities(ntilde) ± set entities(ograve) ≥ set entities(oacute) ≤ set entities(ocirc) ⌠ set entities(otilde) ⌡ set entities(ouml) ÷ set entities(divide) ≈ set entities(oslash) ° set entities(ugrave) ∙ set entities(uacute) · set entities(ucirc) √ set entities(uuml) ⁿ set entities(yacute) ² set entities(thorn) ■ set entities(yuml) return } ############################################################################### # ShowLinks # Show the links found in the last preprocessed page, it's only good for # debugging. ############################################################################### proc ShowLinks {} { variable nLinks variable links for {set i 1} {$i<$nLinks} {incr i} { set description [TidyDescription $links($i,descrip) $links($i,url)] if {[info exists links($i,type)]} { puts "\n$i: $links($i,file) - Bajar: $links($i,ok)" puts "$description - $links($i,type)" puts "$links($i,url)" } else { puts "\n$i: $links($i,file) - Bajar: $links($i,ok)" puts "$description" puts "$links($i,url)" } } return } ############################################################################### # ParseUrl # Given an url 'ParseUrl' will split it in its parts: protocol, domain, # directory and filename # # Parameter # The url to be parsed, # # Returns # A list with the url split as mentioned above or '1' if the url couldn't be # parsed. ############################################################################### proc ParseUrl {url} { if {[regexp -nocase \ {(([^:]*)(?:://))?([^/]+)(((?:~[^/]*)?(?:[^\?]*))(?:/)([^#]*))?} \ $url nada nada protocol domain nada dir fileName]} { if {$protocol==""} { set protocol http } return [list $protocol $domain $dir $fileName] } return 1 } ############################################################################### # TidyDir # Takes things like ".." and "." from the absolute path. # # Parameter: # File path. # # Returns: # The tidied file path. ############################################################################### proc TidyDir {path} { if {[regexp {\.$} $path]} { append path / } for {set a 1 ; set b 1} {($a>0)||($b>0)} {} { set a [regsub -all {/\./} $path {/} path] set b [regsub -all {([^./]+/\.\./)} $path {} path] } for {} {[regsub {^/\.\.} $path {} path]} {} {} return $path } ############################################################################### # RemoveEntities # Given a link or a link description, this procecedure subtitutes the # Html character entities for the real thing, for example '&' gets # changed to '&'. # # Parameter # The string to process. # # Returns # The string processed. ############################################################################## proc RemoveEntities {string} { variable entities while {[regexp {(?:&)([^ ;]+)(;)?} $string old entity]} { regsub {#} $entity {} entity # Eventually this should be replaced with "string is number" if {[regexp {^[0-9]+$} $entity]} { if {[catch {format %c $entity} new]} { break } regsub -all {([\\])} $new {\\\1} new } else { if {[catch {set ::HtmlParser::entities($entity)} new]} { break } } if {$new=="&"} { set new "\\&" } regsub -all $old $string $new string } return $string } ############################################################################### # TidyLinks # Removes Html character entities from the links. It seems that if a # file is, for example, called 'me&you.jpg' some webmasters or Html editors # will put 'me&you.jpg' in the link. # # Side efects # The links in 'links' will contain no character entities ################################################################################ proc TidyLinks {} { variable nLinks variable links for {set i 1} {$i<$nLinks} {incr i} { set links($i,file) [RemoveEntities $links($i,file)] } return } ############################################################################### # TidyDescription # Translates for human eyes the description of the links. # # Parameters: # description: The description to be translated. # url: The url for the translation # # Returns: # The description translated. ############################################################################### proc TidyDescription {description url} { global labelDialogs variable entities variable nLinks variable links if {[regexp {^[\s]*$} $description]} { return $url } if {[regexp -nocase {(<img)} $description]} { regsub -all {<.*?>} $description {} tmp if {![regexp {^\s*$} $tmp]} { set description $tmp } else { set description [GetFileName $description alt] if {$description==1} { set description $url } set description "$labelDialogs(linkImage): $description" } } regsub -all {<.*?>} $description {} description set description [RemoveEntities $description] regsub -all {\s+} $description { } description regsub {^\s} $description {} description if {$description==""} { return $url } return $description } ############################################################################### # ChangeEncoding # Changes the encoding in which the description of the links are written # # Parameters: # newEncoding: The encoding to use. ############################################################################### proc ChangeEncoding {newEncoding} { variable nLinks variable links global dirGetleft for {set i 1} {$i<$nLinks} {incr i} { set links($i,descrip) \ [encoding convertfrom "$newEncoding" $links($i,descrip)] } return } ############################################################################### # GetFileName # Extrac the filename of the link, or the description in the 'alt' field # from whatever it gets send. # # Parameter # tag: the string to process. # type: what we are looking for: "href", "src" or "alt". # # Returns: # The filename or '1' if none was found. ############################################################################### proc GetFileName {tag type} { regsub -all {\s*=\s*} $tag {=} tag if {[regexp -nocase -expanded [subst -nocommand { (?:$type=) (?:(?:(?:\")([^\"]+))| # Filename between "" (?:(?:\')([^\']+))| # Between '' ([^\ \"'>]+)) # No delimiter }] $tag nada a b c]} { if {$a!=""} { set fileName $a } elseif {$b!=""} { set fileName $b } elseif {$c!=""} { set fileName $c } if {[regexp {^(&)(.*)(;)$} $fileName]} { return 1 } if {[regexp -nocase {^mailto:|^news:|^javascript:} $fileName]} { return 1 } # Workaround for Webmasters that don't know the directory separator # is a single / or feel like using the windows one \. regsub -all {([^:])(//)|(\\)} $fileName {\1/} fileName # It seems you can write a link as //site.com/file.html regsub {^//} $fileName {http://} fileName # The following is due to Javascript variables. return $fileName } # If the link happens to be written like: <a href="">index.html</a> if {[regexp -nocase [subst -nocommand {(?:$type=)((\"\")|(\'\'))}] $tag]} { return "" } return 1 } ############################################################################### # CompleteString # Reads from the channel 'leer' until the 'cosa' string includes the # substring passed as a parameter. # # Parameters: # cadena: name of the variable with the string to complete. # leer: channel to read from. # pattern: substring to look for in the channel. # # Returns: # - '0': No errors. # - '1': The string could not be completed. # # Side efects: # 'cosa' is completed. ############################################################################### proc CompleteString {cadena leer pattern} { upvar $cadena cosa while {![regexp -nocase [subst -nocommand {$pattern}] $cosa]} { append cosa [read $leer 20] if {[eof $leer]} { return 1 } } return 0 } ############################################################################### # Parsing # Reads the Web page passed as a parameter and proccess it to extract # all links. # # Parameters: # file: File which contains the page to process. # referer: The referer link for the page. # level: The level in which we found the file to parse. # # Returns: # - '0': No errors. # - '1': Couldn't open file. # # Side efects: # 'nLinks': number of links plus one. # 'links' : keeps all the info about the links. ############################################################################### proc Parsing {file referer level} { global labelDialogs labelTitles getleftOptions variable nLinks variable links variable pageEncoding variable baseTag if {[string match $file ""]} return set nLinks 1 catch {unset links} if {[catch {open $file r} leer]} { return 1 } catch {unset linkType} set pageEncoding "" for {set cosa ""; set thumbnailNext 0 ; set newBase "" ; set baseTag ""} \ {(![eof $leer]) || ([string compare $cosa ""])} {} { if {$getleftOptions(pauseNow)==1} { tkwait variable getleftOptions(pauseNow) } if {![regexp -nocase {((<)(a|b|l|i|f|s|m|t)(.*))|(<$)} $cosa cosa]} { set cosa [read $leer 50] continue } set result [CompleteString cosa $leer ">"] if {$result==1} break regexp {(?:<)([^>]*)(?:>)(.*)} $cosa nada tag cosa set lowerTag [string tolower $tag] set fileName 1 switch -regexp $lowerTag { "^area" { if {[set fileName [GetFileName $tag href]]!=1} { if {[set descrip [GetFileName $tag alt]]!=1} { set links($nLinks,descrip) $descrip } else { set links($nLinks,descrip) "$labelDialogs(map)" } } } "^a.+href" { if {[set fileName [GetFileName $tag href]]!=1} { if {[regexp -nocase {(javascript)(.*)(\()} $fileName]} continue # It so happens you can skip closing the link, browsers # consider them closed if you open another one. set result [CompleteString cosa $leer {(<)((/a)|(a))}] if {$result==1} break regexp -nocase {(.*?)(?:(<)((/a)|(a)))} $cosa nada descripcion set links($nLinks,descrip) $descripcion if {[string match $descripcion ""]} { set fileName 1 } if {[regexp -nocase {<img[^>]* src} $descripcion]} { set thumbnailNext 1 } else { regexp {(.*?)(<)} $descripcion descripcion } } } "^img" { if {[set fileName [GetFileName $tag src]]!=1} { if {[set descrip [GetFileName $tag alt]]==1} { set descrip $fileName } set links($nLinks,descrip) "$labelDialogs(image): $descrip" if {$thumbnailNext==0} { set links($nLinks,type) image } else { set links($nLinks,type) thumb set thumbnailNext 0 } } } ^script { if {![regexp -nocase {/script>} $cosa]} { set result [CompleteString cosa $leer "/script>"] if {$result==1} break } regexp -nocase {(?:/script>)(.*)} $cosa nada cosa if {[set fileName [GetFileName $tag src]]!=1} { set links($nLinks,descrip) "Script: $fileName" } } "^frame" { if {[set fileName [GetFileName $tag src]]!=1} { set links($nLinks,descrip) "$labelDialogs(frame): $fileName" } } "^base" { if {[set fileName [GetFileName $tag href]]!=1} { set baseTag $tag set newBase [CompleteUrl $fileName $referer ""] set fileName 1 } } "^link.+href" { if {[set fileName [GetFileName $tag href]]!=1} { set links($nLinks,descrip) "$labelDialogs(css)" } } ^meta { if {![regexp {(?:meta *)(?:charset=)(?:\"|')?([^\"' ]*)} $tag \ nada pageEncoding]} { if {[set fileName [GetFileName $tag url]]!=1} { set links($nLinks,descrip) \ "$labelDialogs(relocation): $fileName" } } } ^table|^td|^th|^tr|^layer|^ilayer|^body { if {[set fileName [GetFileName $tag background]]!=1} { set links($nLinks,descrip) "$labelDialogs(image): $fileName" } } } if {$fileName!=1} { # set newName $fileName if {![regexp {^#} $fileName]} { set links($nLinks,file) $fileName set url [CompleteUrl $fileName $referer $newBase] set links($nLinks,url) $url incr nLinks } } } close $leer return 0 } ############################################################################### # FilterLinks # Filters the links extracted from a page according to the rules given. # # Parameters # referer: The url of the page we got the links from. # linkArray: Name of the array where the links are stored # level: The recursion level in which we found the referer. # externalLevel: and the recursion level for links outside de domain. # # Side effects: # 'nLinks' and 'links' are upated to the new, filtered links ############################################################################### proc FilterLinks {referer linkArray level {externalLevel 0}} { global downOptions siteUrl directories urlsDownloaded upvar #0 $linkArray links set baseSite $siteUrl(www) regexp {(.*)(:)} $siteUrl(www) nada baseSite for {set i 1} {[info exists links($i,url)]} {incr i} { set link $links($i,url) set links($i,ok) 1 if {([regexp {\.ram$} $link])} { set links($i,ok) 0 continue } if {($downOptions(filter)!="")&&([regexp -nocase "$downOptions(filter)" \ $links($i,file)])} { set links($i,ok) 0 continue } if {([regexp {\?} $link])&&($downOptions(cgi)==0)} { set links($i,ok) 0 continue } if {[info exists urlsDownloaded($link)]} { set links($i,ok) 0 continue } if {[regexp {^ftp:.*/$} $link]} { set links($i,ok) 0 continue } if {[regexp -nocase {^https://} $link]} { set links($i,ok) 0 continue } set parsedUrl [ParseUrl $link] set protocol [lindex $parsedUrl 0] set direccion_www [lindex $parsedUrl 1] set directory [lindex $parsedUrl 2] if {![info exists direccion_www]} { set links($i,ok) 0 ; # Maybe something should be put in the error \ log about this. continue } # www.domain.com and www.domain.com:8080 will be considered the # same site. set linkSite $direccion_www regexp {(.*)(:)} $direccion_www nada linkSite if {[string compare [string tolower $baseSite] \ [string tolower $linkSite]]} { if {$downOptions(exLevels)<=$externalLevel} { set links($i,ok) 0 continue } set externalLink 1 } else { set externalLink 0 } if {($downOptions(dir)==0)&&($siteUrl(base)!="")&&($externalLink==0)} { if {![regexp -nocase "^$siteUrl(base)" $directory]} { set links($i,ok) 0 continue } } if {($downOptions(levels)!=-1)&&($downOptions(levels)<=$level)} { set links($i,ok) 0 continue } set remove 0 catch { if {($links([expr {$i+1}],type)=="thumb")&&($downOptions(images)==1)} { if {[regexp -nocase {(gif$)|(jpg$)|(jpeg$)|(bmp$)|(xbm$)|(tiff$)|(png$)}\ $links($i,file)]} { set remove 1 } } } catch { if {($links($i,type)=="thumb")&&($downOptions(images)==2)} { set remove 1 } } if {$remove==1} { set links($i,ok) 0 continue } } return } ############################################################################### # CompleteUrl # Given a link, this procedure returns the full Url of that link, for # example, a link from a page may be '../index.html', this procedure # will return something like 'http://www.algo.es/cosas/index.html' # # Parameter # link: I'll let you guess # referer: url of the referrer page for the link # newBase: In case the page contains a 'BASE' tag, this will have the # url to use as base for the links. # # Returns # The url ############################################################################### proc CompleteUrl {link referer newBase} { global siteUrl set link [RemoveEntities $link] if {[regexp {://} $link]} { # cgi links may have a http:// and still be relative if {![regexp {(\?)(.*)(://)} $link]} { return $link } } if {$newBase==""} { set parsedUrl [ParseUrl $referer] } else { set parsedUrl [ParseUrl $newBase] } set prot [lindex $parsedUrl 0] set domain [lindex $parsedUrl 1] set dir [lindex $parsedUrl 2] if {[regexp {(?::/)([^/].*)} $link nada fileName]} { set url $prot://$domain/$fileName return $url } if {[regexp {^/} $link]} { set url $prot://$domain$link return $url } set fileName [TidyDir $dir/$link] set url "$prot://$domain$fileName" return $url } SetEntities }